Introduction

This is an investigation on the use of Topic Modeling on the course catalog at Florida Polytechnic University.

library(tidytext)
library(textmineR)
library(broom)
library(tidyr)
library(dplyr)
library(ggplot2)
library(here)
library(stringr)
library(DT)
library(proxy)
library(smacof)
library(ggrepel)
# library(MASS)
library(stringdist)
library(widyr)
library(igraph)
library(ggraph)
ptsize <- 2
legends <- FALSE
dist_lmt <- 3.4
library(readr, quietly = T)

set.seed(543)
source(here("scripts/functions/transform_course_data.R"))
data <- read_csv(here("data/courses-list-fpu.csv"))
classes2remove <- c("FIN_2001", "FIN2002", "EGN_4941", "COP_3337", "CAP_4739", "EEE_5935", "CNT_3502", 
                    "COP_3834C")


filter_regex = ""
replace_regex = ""
filtering_string <- regex(paste0("^Week|\\s{2}|^\\s{1}|^Quiz|^Chapter|^Case|^http|Ch.|^Incoterms|Exam|Presentations|www|", 
                                   filter_regex), 
                            ignore_case = TRUE)
replace_string <- regex(paste0("^\\d{1}\\. |\\d{2}\\. |^Lab \\d{1}. |^Lab \\d{2}. |^[a-z]. |^\\d{1}.|", 
                               replace_regex), 
                        ignore_case = TRUE)

data <- clean_columns(data) 


# This will eventually be its own script
outl_df <- data %>% 
  mutate(new_col = strsplit(as.character(Course_Description), "[\\\r\\\n\\\t]+")) 

main_outl_df <- outl_df %>% tidyr::separate_rows(new_col, sep = "^[0-9].")  %>% filter(!grepl("^\\d{1}\\. |\\d{2}\\. ", new_col))
# Getting everything else just in case we need them
side_outl_df <- outl_df %>% tidyr::separate_rows(new_col, sep = "^[0-9].") %>% filter(!grepl(paste0("^\\d{1}\\. |\\d{2}\\. "), new_col)) 

main_outl_df$new_col <- main_outl_df$new_col %>% 
  str_replace_all(replace_string, "")


# Separate 
side_outl_new <- side_outl_df %>% 
  filter(!grepl(filtering_string, 
                new_col, ignore.case = TRUE)) %>% 
  filter(!is.na(new_col))


side_outl_new$new_col <- side_outl_new$new_col %>% str_replace_all(replace_string, "")
# Joining the two dataframes for the new_col
full_outl <- main_outl_df %>% rbind(side_outl_new)

# Filtering to the department
full_outl <- full_outl %>% 
  filter(Department_Name == params$department) %>% 
  filter(Is_Active == 1)
  # filter(Department_Name == "Computer Science")
  # filter(Department_Name == "Data Science and Business Analytics")


# Getting bigrams
terms_bigram <- full_outl %>% 
  select(c(Course_ID, new_col)) %>% 
  unnest_tokens("desc_word", new_col, token = "ngrams", n = 2) %>% 
  separate(desc_word, c("word1", "word2")) %>% 
  filter(!word1 %in% c(stop_words$word, "research", "scientific", "paper", "guest", "topics", "based", "covers", "current", "toolset", "current", "student", "unknown", "senior", "relevant", "term", "skills")) %>%
  filter(!grepl("^[0-9]", word1)) %>% 
  filter(!word2 %in% c(stop_words$word, "include", "information", "sources", "project", "term", "base")) %>% 
  filter(!grepl("^[0-9]", word2)) %>% 
  unite(desc_bigram, word1, word2, sep = " ") %>% 
  filter(!desc_bigram == "NA NA")

bigram_dtm <- terms_bigram %>% 
  count(Course_ID, desc_bigram, sort = TRUE) %>% 
  cast_dtm(Course_ID, desc_bigram, n)

# List of course_id matched to course names 
course_list <- split(full_outl$Name, full_outl$Course_ID)

LDA

The first test is an LDA model with k = 5 using the Gibbs method.

library(topicmodels)
# k = 5 for the number of concentrations
bigram_lda <- LDA(bigram_dtm, k = ifelse(params$department == "Computer Science", 6, 5), method = "Gibbs", control=list(iter = 500, verbose = 25, alpha = 0.2))
## K = 5; V = 877; M = 73
## Sampling 500 iterations!
## Iteration 25 ...
## Iteration 50 ...
## Iteration 75 ...
## Iteration 100 ...
## Iteration 125 ...
## Iteration 150 ...
## Iteration 175 ...
## Iteration 200 ...
## Iteration 225 ...
## Iteration 250 ...
## Iteration 275 ...
## Iteration 300 ...
## Iteration 325 ...
## Iteration 350 ...
## Iteration 375 ...
## Iteration 400 ...
## Iteration 425 ...
## Iteration 450 ...
## Iteration 475 ...
## Iteration 500 ...
## Gibbs sampling completed!

Probabilities

Topic-Term Probabilities (Betas)

course_topics <- tidy(bigram_lda, matrix = "beta")
course_topics %>% 
  datatable()

Document-Topic Probabilities (Gamma)

course_docs <- tidy(bigram_lda, matrix = "gamma")

course_docs %>% 
  group_by(topic) %>%
  slice_max(gamma, n = 5) %>% 
  ungroup() %>%
  arrange(topic, -gamma) %>% 
  datatable()
# course_top_docs %>%
#   mutate(document = reorder_within(document, gamma, topic)) %>%
#   ggplot(aes(gamma, document, fill = factor(document))) +
#   geom_col(show.legend = FALSE) +
#   facet_wrap(~ topic, scales = "free") +
#   scale_y_reordered()

Can it identify the five different Concentrations?

The five concentrations are as follows:
- Logistics & Supply Chain Management
- Intelligent Mobility
- Quantitative Economics and Econometrics
- Big Data Analytics
- Health Systems Engineering

The LDA model we have seems to be able to spread the topics pretty well. But there seems to be a shortcoming in its ability to separate one concentration from one another. I believe this is due to the fact that a lot of the DSBA curriculum overlaps in many ways.

course_top_terms <- course_topics %>% 
  filter(!is.na(term)) %>% 
  group_by(topic) %>%
  arrange(-beta) %>% 
  slice_head(n = 30) %>% 
  ungroup() %>%
  arrange(topic, -beta)

course_top_terms %>% 
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") + 
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
  
  scale_y_reordered() 

# This will save it to the latex document directory so I do not have to manually move everything 
ggsave(paste0("final-doc/Content/images/", 
              ifelse(
                params$department == "Computer Science",
                "lda_cs.png",
                "lda.png"
              )))

ggsave("img/lda.png", dpi = 300)

Distance metrics

There are some distance metrics I would like to try
- Hellinger Distance (In-progress) - Cosine Similarity (Isn’t this done when using MCA/CA?)
- Jaccard Similary (In-progress) - Sorensen-Dice Similarity (In-progress)

Euclidean Distance

dist_euc <- bigram_dtm %>% 
  tidy() %>% 
  pairwise_dist(item = document, feature = term, value = count, method = "euclidean") 

dist_euc %>%
  filter(distance < dist_lmt) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = -distance), show.legend = legends) +
  geom_node_point(color = "#532d8e", size = ptsize) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()+ 
  labs(title = "Distance Plot: Euclidean")

Manhattan Distance

dist_manh <- bigram_dtm %>% 
  tidy() %>% 
  pairwise_dist(item = document, feature = term, value = count, method = "manhattan") 

dist_manh %>%
  filter(distance < 15) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = -distance), show.legend = legends) +
  geom_node_point(color = "#532d8e", size = ptsize) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() + 
  labs(title = "Distance Plot: Manhattan")

Cosine Distance

sim_cos <- bigram_dtm %>% 
  tidy() %>% 
  pairwise_similarity(item = document, feature = term, value = count)

sim_cos %>% 
  mutate(distance = 1 - similarity) %>% 
  filter(distance < ifelse(params$department == "Computer Science", 0.9, 0.02)) %>% 
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = -distance), show.legend = legends) +
  geom_node_point(color = "#532d8e", size = ptsize) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() + 
  labs(title = "1 - Cosine Similarity Plot: Bigrams")

Burrow’s Delta

delta_brw <- bigram_dtm %>% 
  tidy() %>% 
  pairwise_delta(item = document, feature = term, value = count, method = "burrows")

delta_brw %>% 
  filter(delta < 0.1) %>% 
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = -delta), show.legend = legends) +
  geom_node_point(color = "#532d8e", size = ptsize) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() + 
  labs(title = "Burrows Delta")

Linear Delta

delta_lnr <- bigram_dtm %>% 
  tidy() %>% 
  pairwise_delta(item = document, feature = term, value = count, method = "argamon")

delta_lnr %>% 
  filter(delta < 0.029) %>% 
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = -delta), show.legend = legends) +
  geom_node_point(color = "#532d8e", size = ptsize) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() + 
  labs(title = "Argamon's Linear Delta")

Distance metrics with full course descriptions

This is the same as the graphs above, but using the full course descriptions by: - Getting each word individually
- removing the stop words (a, the, and)
- Rejoining all of the descriptions together
- Computing the distance matrices based on the full descriptions

# Doing what I said above
course_full_desc <- full_outl %>% 
  select(c(Course_ID, new_col)) %>% 
  unnest_tokens("word", new_col) %>% 
  filter(!word %in% stop_words$word) %>% 
  filter(!grepl("[0-9]", word)) %>% 
  group_by(Course_ID) %>% 
  summarise(text = str_c(word, collapse = " ")) %>% 
  ungroup() %>% 
  filter(!is.na(text))

Cosine Similarity

cos_mat <- stringdistmatrix(course_full_desc$text, course_full_desc$text, useNames = FALSE, method = "cosine") %>% 
  as.matrix()

colnames(cos_mat) <- course_full_desc$Course_ID
rownames(cos_mat) <- course_full_desc$Course_ID

cos_course <- reshape2::melt(cos_mat)[reshape2::melt(upper.tri(cos_mat))$value,]

colnames(cos_course) <- c("Term1", "Term2", "distance")


cos_course %>% 
  filter(distance < 0.02) %>% 
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = -distance), show.legend = FALSE) +
  geom_node_point(color = "#532d8e", size = ptsize) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() + 
  labs(title = "1 - Cosine Similarity Plot: Full Desc.")

ggsave(paste0("final-doc/Content/images/", 
              ifelse(
                params$department == "Computer Science",
                "cos_cs.png",
                "cos.png"
              )))

ggsave("img/cos.png", dpi = 300)

Jaccard Similarity

jac_mat <- stringdistmatrix(course_full_desc$text, course_full_desc$text, useNames = FALSE, method = "jaccard") %>% 
  as.matrix()

colnames(jac_mat) <- course_full_desc$Course_ID
rownames(jac_mat) <- course_full_desc$Course_ID

jac_course <- reshape2::melt(jac_mat)[reshape2::melt(upper.tri(jac_mat))$value,]

colnames(jac_course) <- c("Term1", "Term2", "distance")


jac_course %>% 
  filter(distance < 0.04) %>% 
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = -distance), show.legend = FALSE) +
  geom_node_point(color = "#532d8e", size = ptsize) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void() + 
  labs(title = "1 - Jaccard Similarity Plot: Full Desc.")

ggsave(paste0("final-doc/Content/images/", 
              ifelse(
                params$department == "Computer Science",
                "jac_cs.png",
                "jac.png"
              )))

ggsave("img/jac.png", dpi = 300)

MDS

All MDS implementations are nonmetric (for ordinal data).

MDS with Euclidean Distance

library(plotly)
mds_euc <- bigram_dtm %>% 
  stats::dist(method = "euclidean") %>% 
  # t() %>% 
  mds(type = "ordinal")

ggplot() +
  geom_point(data = as_tibble(mds_euc$conf), aes(x = D1, y = D2)) +
  geom_text(as_tibble(mds_euc$conf), mapping = aes(x = -D1,y= -D2), label = rownames(bigram_dtm)) +
  theme_minimal() +
  labs(title = "MDS with Euclidean Distance Matrix")

MDS with Manhattan Distance

mds_man <- bigram_dtm %>% 
  stats::dist(method = "manhattan") %>% 
  # t() %>% 
  mds(type = "ordinal")

ggplot() +
  geom_point(data = as_tibble(mds_man$conf), aes(x = D1, y = D2)) +
  geom_text(as_tibble(mds_man$conf), mapping = aes(x = -D1,y= -D2), label = rownames(bigram_dtm)) +
  theme_minimal() +
  geom_text_repel() +
  labs(title = "MDS with Manhattan Distance Matrix")

MDS with 1 - Cosine Similarity

library(slam)
cosine_dist_mat <- 1 - crossprod_simple_triplet_matrix(t(bigram_dtm))/(sqrt(col_sums(t(bigram_dtm)^2) %*% t(col_sums(t(bigram_dtm)^2))))

mds_cos <- cosine_dist_mat %>% 
  # t() %>% 
  mds(type = "ordinal")


ggplot() +
  geom_point(data = as_tibble(mds_cos$conf), aes(x = D1, y = D2)) +
  geom_text(as_tibble(mds_cos$conf), mapping = aes(x = -D1,y= -D2), label = rownames(bigram_dtm)) +
  geom_text_repel() +
  theme_minimal() +
  labs(title = "MDS with 1 - Cosine Similarity")

MDS with 1 - Jaccard Similarity

Still trying to figure this one out

# mds_jac <- bigram_dtm %>% 
#   dist(method = "Jaccard", pairwise = TRUE) %>% 
#   # t() %>% 
#   mds(type = "ordinal")
# 
# 
# ggplot() +
#   geom_point(data = as_tibble(mds_jac$conf), aes(x = D1, y = D2)) +
#   geom_text(as_tibble(mds_jac$conf), mapping = aes(x = -D1,y= -D2), label = rownames(bigram_dtm)) +
#   geom_text_repel() +
#   theme_minimal() +
#   labs(title = "MDS with 1 - Jaccard Similarity")

MDS with Course Descriptions

MDS with 1 - Cosine Similarity

mds_cos_mat <- cos_mat %>% 
  mds(type = "ordinal")

ggplot() +
  geom_point(data = as_tibble(mds_cos_mat$conf), aes(x = D1, y = D2, colour = D2 > 0.5)) +
  scale_colour_manual(values = setNames(c('#532d8e','grey'),c(T, F))) +
  scale_alpha_manual(values = c(1, 0.01)) +
  geom_text(as_tibble(mds_cos_mat$conf), mapping = aes(
    x = -D1, y = -D2, color = D2 < -0.5, label = paste(rownames(mds_cos_mat$conf))), alpha = .7) +
  geom_text_repel() +
  theme_minimal() +
  labs(title = "MDS with 1 - Cosine Similarity") +
  theme(legend.position = "")

ggsave(paste0("final-doc/Content/images/", 
              ifelse(
                params$department == "Computer Science",
                "cos_mds_cs.png",
                "cos_mds.png"
              )))

ggsave("img/cos_mds.png", dpi = 300) 

MDS with 1 - Jaccard Similarity

mds_jac_mat <- jac_mat %>% 
  mds(type = "ordinal")

ggplot() +
  geom_point(data = as_tibble(mds_jac_mat$conf), aes(x = D1, y = D2, colour = D2 > 0.5 | D2 < -0.55)) +
  scale_colour_manual(values = setNames(c('#532d8e','grey'),c(T, F))) +
  scale_alpha_manual(values = c(1, 0.01)) +
  geom_text(as_tibble(mds_jac_mat$conf), mapping = aes(
    x = -D1, y = -D2, color = D2 < -0.5  | D2 > 0.55, label = paste(rownames(mds_jac_mat$conf))), alpha = .7) +
  geom_text_repel() +
  theme_minimal() +
  labs(title = "MDS with 1 - Jaccard Similarity") +
  theme(legend.position = "")

ggsave(paste0("final-doc/Content/images/", 
              ifelse(
                params$department == "Computer Science",
                "jac_mds_cs.png",
                "jac_mds.png"
              )))

ggsave("img/jac_mds.png", dpi = 300)